library(tidyverse)
library(haven)
library(lme4)
library(lmerTest)
library(mice)
library(broom)

library(marginaleffects)

# read in data
hkcs_23 <- read_sas("hkcs2023_hs_public_nw.sas7bdat")
hkcs_21 <- read_sas("hkcs2021_hs_public_nw.sas7bdat")

# basic data cleaning for 2023 data
hkcs_23 <- hkcs_23 %>%
  mutate(trans = factor(case_when(GENDER_ID == "A" ~ "Non-trans",
                           GENDER_ID == "B" ~ "Trans",
                           GENDER_ID == "C" ~ "Unsure",
                           TRUE ~ "Missing")),
         phys_dis = case_when(DISABILITY_PHYSICAL == "A" ~ 1,
                              DISABILITY_PHYSICAL == "B" ~ 0),
         ment_dis = case_when(DISABILITY_EMOTIONAL == "A" ~ 1,
                              DISABILITY_EMOTIONAL == "B" ~ 0),
         race_ethnicity = case_when(RACE_ID2 == 2 ~ "Black",
                                    RACE_ID2 == 3 ~ "Asian",
                                    RACE_ID2 == 4 ~ "Hispanic/Latine",
                                    RACE_ID2 %in% c(5, 8) ~ "White",
                                    RACE_ID2 %in% c(1, 6, 9) ~ "Other race",
                                    RACE_ID2 == 10 ~ "Multiracial"),
         mother_ed = case_when(SCHOOL_MOTHER %in% c("A", "B") ~ "LTHS",
                               SCHOOL_MOTHER == "C" ~ "HS",
                               SCHOOL_MOTHER == "D" ~ "Some college",
                             SCHOOL_MOTHER == "E" ~ "College grad",
                             SCHOOL_MOTHER == "F" ~ "Graduate degree"),
         grade = case_when(GRADE == "A" ~ "9th grade",
                           GRADE == "B" ~ "10th grade",
                           GRADE == "C" ~ "11th grade",
                           GRADE == "D" ~ "12th grade",
                           GRADE == "E" ~ "Non-trad"),
         year = "2023",
         bullied_identity = case_when(BULLY_WHERE_NONE == "A" ~ 0,
                                      BULLY_ELEC == "B" ~ 0,
                                      BULLY_TEASED_NONE == "A" ~ 0,
                                      BULLY_TEASED_NONE == "" ~ 1))

# basic data cleaning for 2021 data
hkcs_21 <- hkcs_21 %>%
  mutate(trans = factor(case_when(GENDER_ID == "A" ~ "Non-trans",
                                  GENDER_ID == "B" ~ "Trans",
                                  GENDER_ID == "C" ~ "Unsure",
                                  TRUE ~ "Missing")),
         phys_dis = case_when(DISABILITY_PHYSICAL == "A" ~ 1,
                              DISABILITY_PHYSICAL == "B" ~ 0),
         ment_dis = case_when(DISABILITY_EMOTIONAL == "A" ~ 1,
                              DISABILITY_EMOTIONAL == "B" ~ 0),
         race_ethnicity = case_when(RACE_ID2 == 2 ~ "Black",
                                    RACE_ID2 == 3 ~ "Asian",
                                    RACE_ID2 == 4 ~ "Hispanic/Latine",
                                    RACE_ID2 %in% c(5, 8) ~ "White",
                                    RACE_ID2 %in% c(1, 6, 9) ~ "Other race",
                                    RACE_ID2 == 10 ~ "Multiracial"),
         mother_ed = case_when(SCHOOL_MOTHER %in% c("A", "B") ~ "LTHS",
                               SCHOOL_MOTHER == "C" ~ "HS",
                               SCHOOL_MOTHER == "D" ~ "Some college",
                               SCHOOL_MOTHER == "E" ~ "College grad",
                               SCHOOL_MOTHER == "F" ~ "Graduate degree"),
         grade = case_when(GRADE == "A" ~ "9th grade",
                           GRADE == "B" ~ "10th grade",
                           GRADE == "C" ~ "11th grade",
                           GRADE == "D" ~ "12th grade",
                           GRADE == "E" ~ "Non-trad"),
         year = "2021",
         bullied_identity = case_when(BULLY_SCHOOL == "B" ~ 0,
                                      BULLY_ELEC == "B" ~ 0,
                                      BULLY_TEASED_NONE == "A" ~ 0,
                                      BULLY_TEASED_NONE == "" ~ 1))

# append 2021 to 2023 data
co_schools <- plyr::rbind.fill(hkcs_21, hkcs_23)

# create year-specific unique school id (schoolids are only unique within years)
co_schools <- co_schools %>%
  mutate(unique_school_id = factor(if_else(year == "2021", schoolid * 100 + 21,
                                    schoolid * 100 + 23)))

# aggregate data to school level
school_level <- co_schools %>%
  group_by(unique_school_id) %>%
  summarise(prop_trans = mean(trans == "Trans", na.rm = T),
            prop_dis = mean(phys_dis, na.rm = T),
            prop_mdis = mean(ment_dis, na.rm = T),
            prop_bullied = mean(bullied_identity, na.rm = T),
            prop_white = mean(race_ethnicity == "White", na.rm = T),
            prop_college = mean(mother_ed %in% c("College grad", "Graduate degree"), na.rm = T))

# save descriptive statistics at school level
sd_dis <- sd(school_level$prop_dis, na.rm = T)
mean_dis <- mean(school_level$prop_dis, na.rm = T)

sd_mdis <- sd(school_level$prop_mdis, na.rm = T)
mean_mdis <- mean(school_level$prop_mdis, na.rm = T)

sd_white <- sd(school_level$prop_white, na.rm = T)
mean_white <- mean(school_level$prop_white, na.rm = T)

sd_bullied <- sd(school_level$prop_bullied, na.rm = T)
mean_bullied <- mean(school_level$prop_bullied, na.rm = T)

sd_college <- sd(school_level$prop_college, na.rm = T)
mean_college <- mean(school_level$prop_college, na.rm = T)

# standardize school-level variables for easier interpretation of regression coefs
school_level <- school_level %>%
  mutate(prop_dis_c = (prop_dis - mean_dis)/(sd_dis),
         prop_mdis_c = (prop_mdis - mean_mdis)/(sd_mdis),
         prop_white_c = (prop_white - mean_white)/(sd_white),
         prop_bullied_c = (prop_bullied - mean_bullied)/(sd_bullied),
         prop_college_c = (prop_college - mean_college)/(sd_college))

# merge student-level data with school-level data
student_merged <- left_join(co_schools, school_level, join_by(unique_school_id))

# create three binary variables for transgender status--"conservative," "exclusive," and "inclusive"
student_merged <- student_merged %>%
  mutate(trans_cons = case_when(trans == "Non-trans" ~ 0,
                                trans == "Trans" ~ 1),
         trans_exc = case_when(trans == "Non-trans" ~ 0,
                               trans == "Unsure" ~ 0,
                               trans == "Trans" ~ 1),
         trans_inc = case_when(trans == "Non-trans" ~ 0,
                               trans == "Unsure" ~ 1,
                               trans == "Trans" ~ 1)) %>%
  filter(grade != "Non-trad") # remove non-traditional students who might be much older than other students


student_merged <- student_merged %>% # different weighting system for each year, but normalization addresses this
  mutate(weights = case_when(!is.na(WT_RGN) ~ WT_RGN,
                             !is.na(WT_ST) ~ WT_ST),
         norm_weights = weights/mean(weights))

## additional data cleaning
student_merged$race_ethnicity <- factor(student_merged$race_ethnicity,
                                        levels = c("White", "Black", "Hispanic/Latine", "Asian", "Other race", "Multiracial"))


student_merged <- student_merged %>%
  mutate(phys_dis_fac = factor(phys_dis, levels = c(0,1), labels = c("No", "Yes")),
         ment_dis_fac = factor(ment_dis, levels = c(0,1), labels = c("No", "Yes")),
         mother_ed = factor(mother_ed, levels = c("LTHS", "HS", "Some college",
                                                  "College grad", "Graduate degree")),
         grade = factor(grade, levels = c("9th grade", "10th grade", "11th grade", "12th grade")))

# select variables to put into multiple impuutation model
regression_vars <- student_merged %>%
  select(trans_cons, grade, race_ethnicity, norm_weights,
         mother_ed, phys_dis, prop_dis_c, ment_dis, prop_mdis_c, unique_school_id,
         phys_dis_fac, ment_dis_fac, prop_white_c, prop_bullied_c, prop_college_c)

## Table 5
summary_stats <- regression_vars %>%
  filter(!is.na(race_ethnicity), !is.na(mother_ed), !is.na(phys_dis_fac), !is.na(ment_dis_fac))

wtpct <- function(x, y) sum(x, na.rm = TRUE) / sum(y, na.rm = T) * 100

datasummary(grade + race_ethnicity + mother_ed + phys_dis_fac + ment_dis_fac + 1 ~ trans*(N + norm_weights*Percent(fn = wtpct, denom = "col")),
                    data = summary_stats,
            output = "hkcs_summary.html")

## get predictor matrix
imp <- mice(regression_vars, maxit = 0)

predM <- imp$predictorMatrix

# don't use school id or weights to predict other variables
predM[,c("unique_school_id", "norm_weights")] <- 0

# create 5 imputed data sets
imp2 <- mice(regression_vars, m = 5, 
             predictorMatrix = predM, 
             seed = 54321,
             method = "pmm", print =  FALSE)

complete <- mice::complete(imp2, action = "long", include = T)

# delete imputed outcome variables (see von Hippel)
complete_del <- complete[complete$`.id` %in% which(!is.na(regression_vars$trans_cons)),]

# calculate school-level proportion with disabilities from centered version for
# interpretable figures
complete_del$prop_dis <- complete_del$prop_dis_c*sd_dis + mean_dis
complete_del$prop_mdis <- complete_del$prop_mdis_c*sd_mdis + mean_mdis
complete_mids <- as.mids(complete_del)

# fit glmer model (model 1) with imputed data sets
fitimp <- with(complete_mids, glmer(trans_cons ~ grade + race_ethnicity + mother_ed + phys_dis*prop_dis_c+ (1 | unique_school_id),
                               family = binomial(link = "logit"),
                               weights = norm_weights,
                               control = glmerControl(optimizer = "bobyqa",
                                                      optCtrl=list(maxfun=2e5))))

library(broom.mixed)

model1 <- pool(fitimp$analyses)

# fit model 2 with imputed data sets
fitimp2 <- with(complete_mids, glmer(trans_cons ~ grade + race_ethnicity + mother_ed + phys_dis + ment_dis*prop_mdis_c + (1 | unique_school_id),
                                     family = binomial(link = "logit"),
                                     weights = norm_weights,
                                     control = glmerControl(optimizer = "bobyqa",
                                                            optCtrl=list(maxfun=2e5))))

model2 <- pool(fitimp2$analyses)

# fit model 3 with imputed data sets
fitimp3 <- with(complete_mids, glmer(trans_cons ~ grade + mother_ed + phys_dis*prop_dis_c + phys_dis*prop_white_c + phys_dis*prop_college_c + phys_dis*prop_bullied_c + (1 | unique_school_id),
                                     weights = norm_weights,
                                     control = glmerControl(optimizer = "bobyqa",
                                                            optCtrl=list(maxfun=2e5)),
                                     family = binomial(link = "logit")))

model3 <- pool(fitimp3$analyses)

# create tidy model objects for modelsummary
mod1 <- list(
  tidy = broom.mixed::tidy(model1),
  glance = broom.mixed::glance(model1))

mod2 <- list(
  tidy = broom.mixed::tidy(model2),
  glance = broom.mixed::glance(model2))

mod3 <- list(
  tidy = broom.mixed::tidy(model3),
  glance = broom.mixed::glance(model3))

# select and rename coefficients to display
coefs <- c("phys_dis" = "Physical disability (student)",
           "prop_dis_c" = "Proportion of students with physical disabilities",
           "phys_dis:prop_dis_c" = "Physical disability x Proportion of students
            with physical disabilities",
           "ment_dis" = "Mental disability (student)",
           "prop_mdis_c" = "Proportion of students with mental disabilities",
           "ment_dis:prop_mdis_c" = "Mental disability x Proportion of students
           with mental disabilities",
           "prop_white_c" = "Proportion of White students",
           "prop_college_c" = "Proportion of students whose mothers have a college degree",
           "prop_bullied_c" = "Proportion of students who have been bullied based on identity characteristics",
           "phys_dis:prop_white_c" = "Physical disability x Proportion of White students",
           "phys_dis:prop_college_c" = "Physical disability x Proportion of students whose mothers have a 
           college degree",
           "phys_dis:prop_bullied_c" = "Physical disability x Proportion of students who have been
           bullied based on identity characteristics")

# tell modelsummary that these are models
class(mod1) <- "modelsummary_list"
class(mod2) <- "modelsummary_list"
class(mod3) <- "modelsummary_list"

# Table 6
modelsummary::modelsummary(list("Model 1" = mod1, "Model 2" = mod2, "Model 3" = mod3),
                           stars = T,
                           output = "hkcs_models.html",
                           coef_map = coefs,
                           title = "Probability of Identifying as Transgender by Student Characteristics and School Context",
                           notes = c("Source: Healthy Kids Colorado Survey, 2021 and 2023 \nl
                                     Logistic regression coefficients (standard errors) from generalized linear mixed models \nl
                                     Adjusted for mother's educational attainment and grade level"))

### Appendix E1: Exclusive definition of trans

# repeat multiple imputation
regression_vars_exc <- student_merged %>%
  select(trans_exc, grade, race_ethnicity, norm_weights,
         mother_ed, phys_dis, prop_dis_c, ment_dis, prop_mdis_c, unique_school_id,
         phys_dis_fac, ment_dis_fac, prop_white_c, prop_bullied_c, prop_college_c)

imp <- mice(regression_vars_exc, maxit = 0)

predM <- imp$predictorMatrix

predM[,c("unique_school_id", "norm_weights")] <- 0

imp2 <- mice(regression_vars_exc, m = 5, 
             predictorMatrix = predM, 
             seed = 54321,
             method = "pmm", print =  FALSE)


complete_exc <- mice::complete(imp2, action = "long", include = T)

complete_exc_del <- complete_exc[complete_exc$`.id` %in% which(!is.na(regression_vars_exc$trans_exc)),]
complete_mids_exc <- as.mids(complete_exc_del)

# refit model 1 with imputed data
fitimp_exc <- with(complete_mids_exc, glmer(trans_exc ~ grade + race_ethnicity + mother_ed + phys_dis*prop_dis_c+ (1 | unique_school_id),
                                    family = binomial(link = "logit"),
                                    weights = norm_weights,
                                    control = glmerControl(optimizer = "bobyqa",
                                                           optCtrl=list(maxfun=2e5))))

model1_exc <- pool(fitimp_exc$analyses)

# refit model 2 with imputed data
fitimp2_exc <- with(complete_mids_exc, glmer(trans_exc ~ grade + race_ethnicity + mother_ed + phys_dis + ment_dis*prop_mdis_c + (1 | unique_school_id),
                                     family = binomial(link = "logit"),
                                     weights = norm_weights,
                                     control = glmerControl(optimizer = "bobyqa",
                                                            optCtrl=list(maxfun=2e5))))

model2_exc <- pool(fitimp2_exc$analyses)

# refit model 3 with imputed data
fitimp3_exc <- with(complete_mids_exc, glmer(trans_exc ~ grade + mother_ed + phys_dis*prop_dis_c + phys_dis*prop_white_c + phys_dis*prop_college_c + phys_dis*prop_bullied_c + (1 | unique_school_id),
                                     weights = norm_weights,
                                     control = glmerControl(optimizer = "bobyqa",
                                                            optCtrl=list(maxfun=2e5)),
                                     family = binomial(link = "logit")))

model3_exc <- pool(fitimp3_exc$analyses)

# create tidy model objects for modelsummary
mod1_exc <- list(
  tidy = broom.mixed::tidy(model1_exc),
  gla`nce = broom.mixed::glance(model1_exc))

mod2_exc <- list(
  tidy = broom.mixed::tidy(model2_exc),
  glance = broom.mixed::glance(model2_exc))

mod3_exc <- list(
  tidy = broom.mixed::tidy(model3_exc),
  glance = broom.mixed::glance(model3_exc))


class(mod1_exc) <- "modelsummary_list"
class(mod2_exc) <- "modelsummary_list"
class(mod3_exc) <- "modelsummary_list"

# create Table E1
modelsummary::modelsummary(list("Model 1" = mod1_exc, "Model 2" = mod2_exc, "Model 3" = mod3_exc),
                           stars = T,
                           output = "hkcs_models_app_c.html",
                           coef_map = coefs,
                           title = "Probability of Identifying as Transgender by Student Characteristics and School Context",
                           notes = c("Source: Healthy Kids Colorado Survey, 2021 and 2023 \nl
                                     Logistic regression coefficients (standard errors) from generalized linear mixed models \nl
                                     Adjusted for mother's educational attainment and grade level"))


### Appendix E: Inclusive definition of trans

# impute variables again
regression_vars_inc <- student_merged %>%
  select(trans_inc, grade, race_ethnicity, norm_weights,
         mother_ed, phys_dis, prop_dis_c, ment_dis, prop_mdis_c, unique_school_id,
         phys_dis_fac, ment_dis_fac, prop_white_c, prop_bullied_c, prop_college_c)

imp <- mice(regression_vars_inc, maxit = 0)

predM <- imp$predictorMatrix

predM[,c("unique_school_id", "norm_weights")] <- 0


imp2 <- mice(regression_vars_inc, m = 5, 
             predictorMatrix = predM, 
             seed = 54321,
             method = "pmm", print =  FALSE)


complete_inc <- mice::complete(imp2, action = "long", include = T)

complete_inc_del <- complete_inc[complete_inc$`.id` %in% which(!is.na(regression_vars_inc$trans_inc)),]
complete_mids_inc <- as.mids(complete_inc_del)


# refit model 1
fitimp_inc <- with(complete_mids_inc, glmer(trans_inc ~ grade + race_ethnicity + mother_ed + phys_dis*prop_dis_c+ (1 | unique_school_id),
                                            family = binomial(link = "logit"),
                                            weights = norm_weights,
                                            control = glmerControl(optimizer = "bobyqa",
                                                                   optCtrl=list(maxfun=2e5))))

model1_inc <- pool(fitimp_inc$analyses)

# refit model 2
fitimp2_inc <- with(complete_mids_inc, glmer(trans_inc ~ grade + race_ethnicity + mother_ed + phys_dis + ment_dis*prop_mdis_c + (1 | unique_school_id),
                                             family = binomial(link = "logit"),
                                             weights = norm_weights,
                                             control = glmerControl(optimizer = "bobyqa",
                                                                    optCtrl=list(maxfun=2e5))))

model2_inc <- pool(fitimp2_inc$analyses)

# refit model 3
fitimp3_inc <- with(complete_mids_inc, glmer(trans_inc ~ grade + mother_ed + phys_dis*prop_dis_c + phys_dis*prop_white_c + phys_dis*prop_college_c + phys_dis*prop_bullied_c + (1 | unique_school_id),
                                             weights = norm_weights,
                                             control = glmerControl(optimizer = "bobyqa",
                                                                    optCtrl=list(maxfun=2e5)),
                                             family = binomial(link = "logit")))

model3_inc <- pool(fitimp3_inc$analyses)

# make tidy model objects
mod1_inc <- list(
  tidy = broom.mixed::tidy(model1_inc),
  glance = broom.mixed::glance(model1_inc))

mod2_inc <- list(
  tidy = broom.mixed::tidy(model2_inc),
  glance = broom.mixed::glance(model2_inc))

mod3_inc <- list(
  tidy = broom.mixed::tidy(model3_inc),
  glance = broom.mixed::glance(model3_inc))


class(mod1_inc) <- "modelsummary_list"
class(mod2_inc) <- "modelsummary_list"
class(mod3_inc) <- "modelsummary_list"

# make Table E2
modelsummary::modelsummary(list("Model 1" = mod1_inc, "Model 2" = mod2_inc, "Model 3" = mod3_inc),
                           stars = T,
                           output = "hkcs_models_app_c_inc.html",
                           coef_map = coefs,
                           title = "Probability of Identifying as Transgender by Student Characteristics and School Context",
                           notes = c("Source: Healthy Kids Colorado Survey, 2021 and 2023 \nl
                                     Logistic regression coefficients (standard errors) from generalized linear mixed models \nl
                                     Adjusted for mother's educational attainment and grade level"))

## Figure 5

# get marginal predictions from model 1 (physical disability)
emmeans_phys <- emmeans(fitimp, ~ prop_dis_c * phys_dis, CIs = T, type = "response",
      at = list(prop_dis_c = seq(-3,3, length.out = 40),
                grade = "9th grade",
                race_ethnicity = "White",
                mother_ed = "College grad"))

# convert to dataframe
emmeans_phys_df <- as.data.frame(emmeans_phys)

# make variables easier to work with
emmeans_phys_df <- emmeans_phys_df %>%
  mutate(prop_dis = mean_dis + prop_dis_c*sd_dis,
         phys_dis = factor(phys_dis),
         lower = lower.CL,
         upper = upper.CL)

# save marginal predictions
write_csv(emmeans_phys_df, "emmeans_phys_df.csv")

# get marginal predictions for model 2 (mental disability)
emmeans_ment <- emmeans(fitimp2, ~ prop_mdis_c * ment_dis, CIs = T, type = "response",
                        at = list(prop_mdis_c = seq(-3,3, length.out = 40),
                                  grade = "9th grade",
                                  race_ethnicity = "White",
                                  mother_ed = "College grad"))

emmeans_ment_df <- as.data.frame(emmeans_ment)

emmeans_ment_df <- emmeans_ment_df %>%
  mutate(prop_mdis = mean_mdis + prop_mdis_c*sd_mdis,
         ment_dis_fac = factor(ment_dis, levels = c(0,1), labels = c("No", "Yes")),
         lower = lower.CL,
         upper = upper.CL)
# save predictions
write_csv(emmeans_ment_df, "emmeans_ment_df.csv")


## diagnostics: are schools with steeper slopes for disability also schools
## with few students with disabilities?


student_complete <- student_merged %>% # remove nas
  filter(!is.na(trans_cons), !is.na(ment_dis), !is.na(phys_dis), !is.na(unique_school_id))

separate_models_phys <- lmList(trans_cons ~ phys_dis | unique_school_id,
                          data = student_complete,
                          family = binomial(link="logit")) # fit a separate glm for each school

separate_models_ment <- lmList(trans_cons ~ ment_dis | unique_school_id,
                          data = student_complete,
                          family = binomial(link="logit")) # fit a separate glm for each school

head(coef(separate_models_phys))
head(coef(separate_models_ment))

school_level <- school_level[which(school_level$unique_school_id %in% unique(student_complete$unique_school_id)),]

school_level_data <- cbind(coef(separate_models_phys), coef(separate_models_ment), school_level)

colnames(school_level_data)[1] <- "intercept_phys"
colnames(school_level_data)[3] <- "intercept_ment"


p1 <- school_level_data %>%
  filter(prop_dis < 0.3, abs(phys_dis) < 10) %>% # remove schools with unstable estimates
  ggplot(aes(x = prop_dis, y = phys_dis)) +
  geom_point() +
  geom_smooth(method = "lm")

p2 <- school_level_data %>%
  filter(prop_mdis < 0.6, abs(ment_dis) < 10) %>% # remove schools with unstable estimates
  ggplot(aes(x = prop_mdis, y = ment_dis)) +
  geom_point() +
  geom_smooth(method = "lm")

p1 # x axis is prop of students with physical disabilities, y axis is slope for physical disability
p2 # x axis is prop of students with mental disabilities, y axis is slope for mental disability
